# Compute the autocorrelation function of a numeric vector and return it
# as a data frame with columns `lag` and `acf`.
# The first row is lag 0 with acf = 1 (stats::acf includes lag 0).
acfdf <- function(vec) {
  # Use TRUE/FALSE, never the reassignable T/F shorthands
  vacf <- acf(vec, plot = FALSE)
  with(vacf, data.frame(lag, acf))
}
# Lollipop-style ACF plot: for each lag, draw a vertical segment from
# zero up to the autocorrelation value, with a horizontal reference
# line at zero. Relies on acfdf() for the underlying ACF data frame.
ggacf <- function(vec) {
  corr_df <- acfdf(vec)
  ggplot(data = corr_df, aes(x = lag, y = acf)) +
    geom_hline(aes(yintercept = 0)) +
    geom_segment(mapping = aes(xend = lag, yend = 0))
}
# Plot a vector as a time series: value against its 1-based index t.
tplot <- function(vec) {
  series <- data.frame(X = vec, t = seq_along(vec))
  ggplot(data = series, aes(x = t, y = X)) + geom_line()
}
Biostat 212a Homework 6
Due Mar 22, 2024 @ 11:59PM
Load R libraries.
1 New York Stock Exchange (NYSE) data (1962-1986) (140 pts)
The NYSE.csv file contains three daily time series from the New York Stock Exchange (NYSE) for the period Dec 3, 1962-Dec 31, 1986 (6,051 trading days).
Log trading volume (\(v_t\)): This is the fraction of all outstanding shares that are traded on that day, relative to a 100-day moving average of past turnover, on the log scale. Dow Jones return (\(r_t\)): This is the difference between the log of the Dow Jones Industrial Index on consecutive trading days. Log volatility (\(z_t\)): This is based on the absolute values of daily price movements.
# Read in NYSE data from url
url = "https://raw.githubusercontent.com/ucla-biostat-212a/2024winter/master/slides/data/NYSE.csv"
NYSE <- read_csv(url)
NYSE# A tibble: 6,051 × 6
date day_of_week DJ_return log_volume log_volatility train
<date> <chr> <dbl> <dbl> <dbl> <lgl>
1 1962-12-03 mon -0.00446 0.0326 -13.1 TRUE
2 1962-12-04 tues 0.00781 0.346 -11.7 TRUE
3 1962-12-05 wed 0.00384 0.525 -11.7 TRUE
4 1962-12-06 thur -0.00346 0.210 -11.6 TRUE
5 1962-12-07 fri 0.000568 0.0442 -11.7 TRUE
6 1962-12-10 mon -0.0108 0.133 -10.9 TRUE
7 1962-12-11 tues 0.000124 -0.0115 -11.0 TRUE
8 1962-12-12 wed 0.00336 0.00161 -11.0 TRUE
9 1962-12-13 thur -0.00330 -0.106 -11.0 TRUE
10 1962-12-14 fri 0.00447 -0.138 -11.0 TRUE
# ℹ 6,041 more rows
The autocorrelation at lag \(\ell\) is the correlation of all pairs \((v_t, v_{t-\ell})\) that are \(\ell\) trading days apart. These sizable correlations give us confidence that past values will be helpful in predicting the future.
Code
ggacf(NYSE$log_volume) + ggthemes::theme_few()Do a similar plot for (1) the correlation between \(v_t\) and lag \(\ell\) Dow Jones return \(r_{t-\ell}\) and (2) correlation between \(v_t\) and lag \(\ell\) Log volatility \(z_{t-\ell}\).
Code
seq(1, 30) %>%
map(function(x) {cor(NYSE$log_volume , lag(NYSE$DJ_return, x),
use = "pairwise.complete.obs")}) %>%
unlist() %>%
tibble(lag = 1:30, cor = .) %>%
ggplot(aes(x = lag, y = cor)) +
geom_hline(aes(yintercept = 0)) +
geom_segment(mapping = aes(xend = lag, yend = 0)) +
ggtitle("AutoCorrelation between `log volume` and lagged `DJ return`")Code
seq(1, 30) %>%
map(function(x) {cor(NYSE$log_volume , lag(NYSE$log_volatility, x),
use = "pairwise.complete.obs")}) %>%
unlist() %>%
tibble(lag = 1:30, cor = .) %>%
ggplot(aes(x = lag, y = cor)) +
geom_hline(aes(yintercept = 0)) +
geom_segment(mapping = aes(xend = lag, yend = 0)) +
ggtitle("AutoCorrelation between `log volume` and lagged `log volatility`")1.1 Project goal
Our goal is to forecast daily Log trading volume, using various machine learning algorithms we learnt in this class.
The data set is already split into train (before Jan 1st, 1980, \(n_{\text{train}} = 4,281\)) and test (after Jan 1st, 1980, \(n_{\text{test}} = 1,770\)) sets.
In general, we would tune the lag \(L\) to achieve the best forecasting performance. In this project, we fix \(L=5\). That is, we always use the previous five trading days’ data to forecast today’s log trading volume.
Pay attention to the nuance of splitting time series data for cross validation. Study and use the time-series functionality in tidymodels. Make sure to use the same splits when tuning different machine learning algorithms.
Use the \(R^2\) between forecast and actual values as the cross validation and test evaluation criterion.
1.2 Baseline method (20 pts)
We use the straw man (use yesterday’s value of log trading volume to predict that of today) as the baseline method. Evaluate the \(R^2\) of this method on the test data.
# Lag: look back L trading days.
# For each of the previous L days, add columns
# DJ_return_lag{i}, log_volume_lag{i}, log_volatility_lag{i}.
# Do not need to include, as we included them in recipe
L <- 5
for (i in seq_len(L)) {
  NYSE <- NYSE %>%
    mutate(
      # Dynamic column names via `!! ... :=`; bare column references
      # (not NYSE$col) are the safe idiom inside mutate().
      !!paste0("DJ_return_lag", i) := lag(DJ_return, i),
      !!paste0("log_volume_lag", i) := lag(log_volume, i),
      !!paste0("log_volatility_lag", i) := lag(log_volatility, i)
    )
}
# Drop beginning trading days which lack some lagged variables
NYSE <- NYSE %>% na.omit()
NYSE_other <- NYSE %>%
filter(train == 'TRUE') %>%
select(-train) %>%
drop_na()
dim(NYSE_other)[1] 4276 20
NYSE_test = NYSE %>%
filter(train == 'FALSE') %>%
select(-train) %>%
drop_na()
dim(NYSE_test)[1] 1770 20
library(yardstick)
# cor(NYSE_test$log_volume, NYSE_test$log_volume_lag1) %>% round(2)
r2_test_strawman = rsq_vec(NYSE_test$log_volume,
lag(NYSE_test$log_volume, 1)) %>%
round(2)
print(paste("Straw man test R2: ", r2_test_strawman))[1] "Straw man test R2: 0.35"
1.3 Autoregression (AR) forecaster (30 pts)
Let \[ y = \begin{pmatrix} v_{L+1} \\ v_{L+2} \\ v_{L+3} \\ \vdots \\ v_T \end{pmatrix}, \quad M = \begin{pmatrix} 1 & v_L & v_{L-1} & \cdots & v_1 \\ 1 & v_{L+1} & v_{L} & \cdots & v_2 \\ \vdots & \vdots & \vdots & \ddots & \vdots \\ 1 & v_{T-1} & v_{T-2} & \cdots & v_{T-L} \end{pmatrix}. \]
Fit an ordinary least squares (OLS) regression of \(y\) on \(M\), giving \[ \hat v_t = \hat \beta_0 + \hat \beta_1 v_{t-1} + \hat \beta_2 v_{t-2} + \cdots + \hat \beta_L v_{t-L}, \] known as an order-\(L\) autoregression model or AR(\(L\)).
Before we start the model training, let’s talk about time series resampling. We will use the
rolling_origin function in the rsample package to create a time series cross-validation plan. When the data have a strong time component, a resampling method should support modeling to estimate seasonal and other temporal trends within the data. A technique that randomly samples values from the training set can disrupt the model’s ability to estimate these patterns.
NYSE %>%
ggplot(aes(x = date,
y = log_volume)) +
geom_line() +
geom_smooth(method = "lm")wrong_split <- initial_split(NYSE_other)
bind_rows(
training(wrong_split) %>% mutate(type = "train"),
testing(wrong_split) %>% mutate(type = "test")
) %>%
ggplot(aes(x = date,
y = log_volume,
color = type,
group = NA)) +
geom_line()correct_split <-
initial_time_split(NYSE_other %>%
arrange(date))
bind_rows(
training(correct_split) %>%
mutate(type = "train"),
testing(correct_split) %>%
mutate(type = "test")
) %>%
ggplot(aes(x = date,
y = log_volume,
color = type,
group = NA)) +
geom_line()rolling_origin(NYSE_other %>% arrange(date),
initial = 30,
assess = 7) %>%
#sliding_period(NYSE_other %>% arrange(date), date, period = "day", lookback = Inf, assess_stop = 1) %>%
mutate(train_data = map(splits,
analysis),
test_data = map(splits,
assessment)) %>%
select(-splits) %>%
pivot_longer(-id) %>%
filter(id %in% c("Slice0001", "Slice0002", "Slice0003")) %>%
unnest(value) %>%
ggplot(aes(x = date,
y = log_volume,
color = name,
group = NA)) +
geom_point() +
geom_line() +
facet_wrap(~id, scales = "fixed")sliding_period(NYSE_other %>% arrange(date),
date,
period = "month",
lookback = Inf,
assess_stop = 1) %>%
mutate(train_data = map(splits,
analysis),
test_data = map(splits,
assessment)) %>%
select(-splits) %>%
pivot_longer(-id) %>%
filter(id %in% c("Slice001", "Slice002", "Slice003")) %>%
unnest(value) %>%
ggplot(aes(x = date,
y = log_volume,
color = name,
group = NA)) +
geom_point() +
geom_line() +
facet_wrap(~id, scales = "fixed")Rolling forecast origin resampling (Hyndman and Athanasopoulos 2018) provides a method that emulates how time series data is often partitioned in practice, estimating the model with historical data and evaluating it with the most recent data.
Tune AR(5) with elastic net (lasso + ridge) regularization using all 3 features on the training data, and evaluate the test performance.
1.4 Preprocessing
en_receipe <-
recipe(log_volume ~ .,
data = NYSE_other) %>%
step_dummy(all_nominal(),
-all_outcomes()) %>%
step_normalize(all_numeric_predictors(),
-all_outcomes()) %>%
step_naomit(all_predictors()) %>%
prep(data = NYSE_other)1.5 Model training
# Model
# Elastic-net linear regression: both penalty (lambda) and the
# ridge/lasso mixture (alpha) are left as tuning parameters.
enet_mod <-
# mixture = 0 (ridge), mixture = 1 (lasso)
# mixture = (0, 1) elastic net
linear_reg(penalty = tune(),
mixture = tune()) %>%
set_engine("glmnet")
# Workflow
# Reuses en_receipe, dropping the raw date column and adding
# missing-value indicator columns before fitting.
en_wf <-
workflow() %>%
add_model(enet_mod) %>%
add_recipe(en_receipe %>%
step_rm(date) %>%
step_indicate_na())
# Folds
# NOTE(review): `folds` appears unused below — tuning uses
# `month_folds`; confirm whether this object can be removed.
folds <- NYSE_other %>%
arrange(date) %>%
sliding_period(date,
period = "month",
lookback = Inf,
assess_stop = 1)
# Expanding-window resamples by month; skip = 4 thins the number of
# resamples (assessment window uses sliding_period() defaults).
month_folds <- NYSE_other %>%
sliding_period(
date,
"month",
lookback = Inf,
skip = 4)# Tuning Grid
# 3 penalty values (10^-8 to 10^-7, log10 scale) x 3 mixture values
# (0, 0.5, 1) = 9 candidate models.
lambda_grid <-
grid_regular(penalty(range = c(-8, -7),
trans = log10_trans()),
mixture(),
levels = 3)
lambda_grid# A tibble: 9 × 2
penalty mixture
<dbl> <dbl>
1 0.00000001 0
2 0.0000000316 0
3 0.0000001 0
4 0.00000001 0.5
5 0.0000000316 0.5
6 0.0000001 0.5
7 0.00000001 1
8 0.0000000316 1
9 0.0000001 1
en_fit <-
tune_grid(en_wf,
resamples = month_folds,
grid = lambda_grid) %>%
collect_metrics()
en_fit# A tibble: 18 × 8
penalty mixture .metric .estimator mean n std_err .config
<dbl> <dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
1 0.00000001 0 rmse standard 0.138 200 0.00297 Preprocessor1_Mo…
2 0.00000001 0 rsq standard 0.339 200 0.0147 Preprocessor1_Mo…
3 0.0000000316 0 rmse standard 0.138 200 0.00297 Preprocessor1_Mo…
4 0.0000000316 0 rsq standard 0.339 200 0.0147 Preprocessor1_Mo…
5 0.0000001 0 rmse standard 0.138 200 0.00297 Preprocessor1_Mo…
6 0.0000001 0 rsq standard 0.339 200 0.0147 Preprocessor1_Mo…
7 0.00000001 0.5 rmse standard 0.133 200 0.00287 Preprocessor1_Mo…
8 0.00000001 0.5 rsq standard 0.381 200 0.0149 Preprocessor1_Mo…
9 0.0000000316 0.5 rmse standard 0.133 200 0.00287 Preprocessor1_Mo…
10 0.0000000316 0.5 rsq standard 0.381 200 0.0149 Preprocessor1_Mo…
11 0.0000001 0.5 rmse standard 0.133 200 0.00287 Preprocessor1_Mo…
12 0.0000001 0.5 rsq standard 0.381 200 0.0149 Preprocessor1_Mo…
13 0.00000001 1 rmse standard 0.133 200 0.00287 Preprocessor1_Mo…
14 0.00000001 1 rsq standard 0.381 200 0.0149 Preprocessor1_Mo…
15 0.0000000316 1 rmse standard 0.133 200 0.00287 Preprocessor1_Mo…
16 0.0000000316 1 rsq standard 0.381 200 0.0149 Preprocessor1_Mo…
17 0.0000001 1 rmse standard 0.133 200 0.00287 Preprocessor1_Mo…
18 0.0000001 1 rsq standard 0.381 200 0.0149 Preprocessor1_Mo…
# Best model
best_en_ny <- en_fit %>%
filter(.metric == "rmse") %>%
slice(which.min(mean))
best_en_ny# A tibble: 1 × 8
penalty mixture .metric .estimator mean n std_err .config
<dbl> <dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
1 0.00000001 0.5 rmse standard 0.133 200 0.00287 Preprocessor1_Model4
# CV R^2
cv_en_rsq <- en_fit %>%
filter(.config == best_en_ny$.config,
.metric == "rsq") %>%
pull(mean)
cv_en_rsq[1] 0.3807083
# Final workflow
final_en_wf <- en_wf %>%
finalize_workflow(best_en_ny)
fit_model <- fit(final_en_wf,
data = NYSE_other)
en_predict <- predict(fit_model,
NYSE_test)
en_results <- bind_cols(NYSE_test %>%
select(log_volume),
en_predict)
en_rsq <- rsq(en_results,
truth = log_volume,
estimate = .pred)
en_rsq# A tibble: 1 × 3
.metric .estimator .estimate
<chr> <chr> <dbl>
1 rsq standard 0.554
- Hint: Workflow: Lasso is a good starting point.
1.6 Random forest forecaster (30pts)
- Use the same features as in AR(\(L\)) for the random forest. Tune the random forest and evaluate the test performance.
### Model
# Random-forest regression (ranger engine); mtry and the number of
# trees are tuning parameters.
rf_ny_mod <-
rand_forest(
mode = "regression",
# Number of predictors randomly sampled in each split
mtry = tune(),
# Number of trees in ensemble
trees = tune()
) %>%
set_engine("ranger")
### Workflow
# Same preprocessing as the elastic-net workflow: drop the raw date
# column, add missing-value indicators.
rf_ny_wf <-
workflow() %>%
add_model(rf_ny_mod) %>%
add_recipe(en_receipe %>%
step_rm(date) %>%
step_indicate_na())
### Folds
# NOTE(review): `folds` appears unused below — tuning uses
# `month_folds`; confirm whether this object can be removed.
folds <- NYSE_other %>% arrange(date) %>%
sliding_period(date, period = "month",
lookback = Inf,
assess_stop = 1)
# Expanding-window monthly resamples, thinned with skip = 4
# (same scheme as the AR(5)/elastic-net tuning for comparability).
month_folds <- NYSE_other %>%
sliding_period(
date,
"month",
lookback = Inf,
skip = 4)
### Tuning Grid
# 3 tree counts (100/200/300) x 5 mtry values (1..5) = 15 candidates.
rf_ny_grid <-
grid_regular(
trees(range = c(100L, 300L)),
mtry(range = c(1L, 5L)),
levels = c(3, 5)
)
rf_ny_grid# A tibble: 15 × 2
trees mtry
<int> <int>
1 100 1
2 200 1
3 300 1
4 100 2
5 200 2
6 300 2
7 100 3
8 200 3
9 300 3
10 100 4
11 200 4
12 300 4
13 100 5
14 200 5
15 300 5
rf_ny_fit <-
tune_grid(rf_ny_wf,
resamples = month_folds,
grid = rf_ny_grid) %>%
collect_metrics()
rf_ny_fit# A tibble: 30 × 8
mtry trees .metric .estimator mean n std_err .config
<int> <int> <chr> <chr> <dbl> <int> <dbl> <chr>
1 1 100 rmse standard 0.161 200 0.00389 Preprocessor1_Model01
2 1 100 rsq standard 0.303 200 0.0142 Preprocessor1_Model01
3 1 200 rmse standard 0.161 200 0.00389 Preprocessor1_Model02
4 1 200 rsq standard 0.311 200 0.0143 Preprocessor1_Model02
5 1 300 rmse standard 0.161 200 0.00387 Preprocessor1_Model03
6 1 300 rsq standard 0.317 200 0.0145 Preprocessor1_Model03
7 2 100 rmse standard 0.147 200 0.00329 Preprocessor1_Model04
8 2 100 rsq standard 0.327 200 0.0146 Preprocessor1_Model04
9 2 200 rmse standard 0.147 200 0.00331 Preprocessor1_Model05
10 2 200 rsq standard 0.329 200 0.0146 Preprocessor1_Model05
# ℹ 20 more rows
# Best model
best_rf_ny <- rf_ny_fit %>%
filter(.metric == "rmse") %>%
# Select the row with the highest rsq value
slice(which.min(mean))
best_rf_ny# A tibble: 1 × 8
mtry trees .metric .estimator mean n std_err .config
<int> <int> <chr> <chr> <dbl> <int> <dbl> <chr>
1 5 300 rmse standard 0.140 200 0.00296 Preprocessor1_Model15
# CV R^2
cv_rf_rsq <- rf_ny_fit %>%
filter(.config == best_rf_ny$.config,
.metric == "rsq") %>%
pull(mean)
cv_rf_rsq[1] 0.34083
# Final workflow
final_rm_ny_wf <- rf_ny_wf %>%
finalize_workflow(best_rf_ny)
fit_rm_ny_model <- fit(final_rm_ny_wf,
data = NYSE_other)
rm_predict <- predict(fit_rm_ny_model,
NYSE_test)
rm_results <- bind_cols(NYSE_test %>%
select(log_volume),rm_predict)
rm_rsq <- rsq(rm_results,
truth = log_volume,
estimate = .pred)
rm_rsq# A tibble: 1 × 3
.metric .estimator .estimate
<chr> <chr> <dbl>
1 rsq standard 0.498
- Hint: Workflow: Random Forest for Prediction is a good starting point.
1.7 Boosting forecaster (30pts)
- Use the same features as in AR(\(L\)) for the boosting. Tune the boosting algorithm and evaluate the test performance.
# Model
# Gradient-boosted trees (xgboost engine); ensemble size fixed at
# 1000 trees, while tree depth and learning rate are tuned.
gb_ny_mod <-
boost_tree(
mode = "regression",
trees = 1000,
tree_depth = tune(),
learn_rate = tune()
) %>%
set_engine("xgboost")
# Workflow
# Same preprocessing as the other forecasters: drop the raw date
# column, add missing-value indicators.
gb_ny_wf <-
workflow() %>%
add_model(gb_ny_mod) %>%
add_recipe(en_receipe %>%
step_rm(date) %>%
step_indicate_na())
# Folds
# NOTE(review): `folds` appears unused below — tuning uses
# `month_folds`; confirm whether this object can be removed.
folds <- NYSE_other %>%
arrange(date) %>%
sliding_period(date,
period = "month",
lookback = Inf,
assess_stop = 1)
# Expanding-window monthly resamples, thinned with skip = 4
# (same scheme as the other models for comparability).
month_folds <- NYSE_other %>%
sliding_period(
date,
"month",
lookback = Inf,
skip = 4)# Tuning Grid
# 3 depths (1, 2, 4) x 3 learning rates (10^-3 to 10^-0.5, log10
# scale) = 9 candidate models.
gb_ny_grid <-
grid_regular(
tree_depth(range = c(1L, 4L)),
learn_rate(range = c(-3, -0.5),
trans = log10_trans()),
levels = c(3, 3))
gb_ny_grid# A tibble: 9 × 2
tree_depth learn_rate
<int> <dbl>
1 1 0.001
2 2 0.001
3 4 0.001
4 1 0.0178
5 2 0.0178
6 4 0.0178
7 1 0.316
8 2 0.316
9 4 0.316
# Fit cross-validation
gb_ny_fit <-
tune_grid(gb_ny_wf,
resamples = month_folds,
grid = gb_ny_grid) %>%
collect_metrics()
gb_ny_fit# A tibble: 18 × 8
tree_depth learn_rate .metric .estimator mean n std_err .config
<int> <dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
1 1 0.001 rmse standard 0.257 200 0.00562 Preprocessor1_M…
2 1 0.001 rsq standard 0.203 200 0.0120 Preprocessor1_M…
3 2 0.001 rmse standard 0.251 200 0.00502 Preprocessor1_M…
4 2 0.001 rsq standard 0.239 200 0.0126 Preprocessor1_M…
5 4 0.001 rmse standard 0.246 200 0.00470 Preprocessor1_M…
6 4 0.001 rsq standard 0.292 200 0.0138 Preprocessor1_M…
7 1 0.0178 rmse standard 0.139 200 0.00296 Preprocessor1_M…
8 1 0.0178 rsq standard 0.349 200 0.0146 Preprocessor1_M…
9 2 0.0178 rmse standard 0.134 200 0.00276 Preprocessor1_M…
10 2 0.0178 rsq standard 0.383 200 0.0154 Preprocessor1_M…
11 4 0.0178 rmse standard 0.135 200 0.00281 Preprocessor1_M…
12 4 0.0178 rsq standard 0.388 200 0.0154 Preprocessor1_M…
13 1 0.316 rmse standard 0.141 200 0.00312 Preprocessor1_M…
14 1 0.316 rsq standard 0.358 200 0.0151 Preprocessor1_M…
15 2 0.316 rmse standard 0.146 200 0.00303 Preprocessor1_M…
16 2 0.316 rsq standard 0.340 200 0.0142 Preprocessor1_M…
17 4 0.316 rmse standard 0.145 200 0.00286 Preprocessor1_M…
18 4 0.316 rsq standard 0.338 200 0.0145 Preprocessor1_M…
# best model
best_gb_ny <- gb_ny_fit %>%
filter(.metric == "rmse") %>%
filter(mean == min(mean))
best_gb_ny# A tibble: 1 × 8
tree_depth learn_rate .metric .estimator mean n std_err .config
<int> <dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
1 2 0.0178 rmse standard 0.134 200 0.00276 Preprocessor1_Mo…
# CV R^2
selected_tree_depth <- best_gb_ny$tree_depth
selected_learn_rate <- best_gb_ny$learn_rate
cv_gb_R <- gb_ny_fit %>%
filter(tree_depth == selected_tree_depth,
learn_rate == selected_learn_rate,
.metric == "rsq")
cv_gb_R# A tibble: 1 × 8
tree_depth learn_rate .metric .estimator mean n std_err .config
<int> <dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
1 2 0.0178 rsq standard 0.383 200 0.0154 Preprocessor1_Mo…
# Final workflow
final_gb_ny_wf <- gb_ny_wf %>%
finalize_workflow(best_gb_ny)
fit_gb_ny_model <- fit(final_gb_ny_wf, data = NYSE_other)
gb_predict <- predict(fit_gb_ny_model, NYSE_test)
gb_results <- bind_cols(NYSE_test %>% select(log_volume),gb_predict)
gb_rsq <- rsq(gb_results, truth = log_volume, estimate = .pred)
gb_rsq# A tibble: 1 × 3
.metric .estimator .estimate
<chr> <chr> <dbl>
1 rsq standard 0.537
- Hint: Workflow: Boosting tree for Prediction is a good starting point.
1.8 Summary (30pts)
Your score for this question is largely determined by your final test performance.
Summarize the performance of different machine learning forecasters in the following format.
| Method | CV \(R^2\) | Test \(R^2\) | |
|---|---|---|---|
| Baseline | — | 0.35 | |
| AR(5) | 0.38 | 0.55 | |
| Random Forest | 0.34 | 0.50 | |
| Boosting | 0.38 | 0.54 |
1.9 Extension reading
2 ISL Exercise 12.6.13 (90 pts)
On the book website, www.statlearning.com, there is a gene expression data set (Ch12Ex13.csv) that consists of 40 tissue samples with measurements on 1,000 genes. The first 20 samples are from healthy patients, while the second 20 are from a diseased group.
2.1 12.6.13 (b) (30 pts)
- Apply hierarchical clustering to the samples using correlation- based distance, and plot the dendrogram. Do the genes separate the samples into the two groups? Do your results depend on the type of linkage used?
data <- read.csv("Ch12Ex13.csv",
header = FALSE)
colnames(data) <- c(paste0("H", 1:20),
paste0("D", 1:20))hc.complete <- hclust(as.dist(1 - cor(data)),
method = "complete")
plot(hc.complete)hc.complete <- hclust(as.dist(1 - cor(data)),
method = "average")
plot(hc.complete)hc.complete <- hclust(as.dist(1 - cor(data)),
method = "single")
plot(hc.complete)Answer: The genes separate the samples into the two groups. The results depend on the type of linkage used.
2.2 PCA and UMAP (30 pts)
PCA
set.seed(101)
pr.gene <- prcomp(t(data), scale = T)
plot(pr.gene)total.load <- apply(pr.gene$rotation, 1, sum)
index <- order(abs(total.load), decreasing = TRUE)
index[1:10] [1] 2 755 889 676 960 475 907 174 878 840
UMAP
library(umap)
library(uwot)
gene.umap <- umap::umap(data)
gene.umap
head(gene.umap$layout) [,1] [,2]
[1,] -1.6056929 1.2579899
[2,] -2.0475020 2.2985369
[3,] 0.8427871 1.2266204
[4,] 0.0279352 2.8620457
[5,] 1.4774538 1.6012876
[6,] 0.5980590 -0.1256017
plot(gene.umap$layout,
xlab = "UMAP_1", ylab = "UMAP_2",
main = "A UMAP visualization of the gene dataset"
)2.3 12.6.13 (c) (30 pts)
- Your collaborator wants to know which genes differ the most across the two groups. Suggest a way to answer this question, and apply it here.
# Group labels for the 40 samples: first 20 columns coded "1", last 20
# coded "0" (per the data description, the first 20 samples are healthy).
grp <- factor(rep(c(1, 0), each = 20))

# Two-group comparison for one gene: regress the expression vector on
# the group factor and return the p-value of the group coefficient
# (equivalent to a two-sample t-test with pooled variance).
regression <- function(y) {
  # `fit_summary`, not `sum`, to avoid shadowing base::sum
  fit_summary <- summary(lm(y ~ grp))
  fit_summary$coefficients[2, 4]
}
out <- tibble(gene = seq(1, nrow(data)),
p_values = unlist(purrr:: map(1:nrow(data),
~regression(as.matrix
(data)[.x, ]))))out %>% arrange(p_values) %>% head(10)# A tibble: 10 × 2
gene p_values
<int> <dbl>
1 502 1.43e-12
2 589 3.19e-12
3 600 9.81e-12
4 590 6.28e-11
5 565 1.74e-10
6 551 1.10e- 9
7 593 1.19e- 9
8 584 1.65e- 9
9 538 2.42e- 9
10 569 2.69e- 9
library(pheatmap)
library(ggplotify) ## to convert pheatmap to ggplot2
library(heatmaply) ## for constructing interactive heatmap# sig <- out %>% arrange(p_values) %>% filter(p_values < 0.05/nrow(Ch12Ex13))
sig <- out %>% arrange(p_values) %>% filter(p_values < 0.05 )#create data frame for annotations
# Annotation data frame for pheatmap: row names are the sample names,
# `status` labels each sample. Columns 1-20 (H1-H20) are the healthy
# patients and columns 21-40 (D1-D20) the diseased group (see the data
# description), so initialize to "healthy" and overwrite the last 20
# with "disease".
# BUG FIX: the original initialized everything to "disease" and set
# rows 21-40 to "healthy", inverting the labels.
dfh <- data.frame(sample = as.character(colnames(data)), status = "healthy") %>%
  column_to_rownames("sample")
dfh$status[seq(21, 40)] <- "disease"
dfh status
H1 disease
H2 disease
H3 disease
H4 disease
H5 disease
H6 disease
H7 disease
H8 disease
H9 disease
H10 disease
H11 disease
H12 disease
H13 disease
H14 disease
H15 disease
H16 disease
H17 disease
H18 disease
H19 disease
H20 disease
D1 healthy
D2 healthy
D3 healthy
D4 healthy
D5 healthy
D6 healthy
D7 healthy
D8 healthy
D9 healthy
D10 healthy
D11 healthy
D12 healthy
D13 healthy
D14 healthy
D15 healthy
D16 healthy
D17 healthy
D18 healthy
D19 healthy
D20 healthy
pheatmap(data[sig$gene, ], cluster_rows = FALSE, cluster_cols = T, scale="row", annotation_col = dfh,
annotation_colors=list(status = c(disease = "orange", healthy = "black")),
color=colorRampPalette(c("navy", "white", "red"))(50))